library("here")
## here() starts at /Users/ayu.3646/Homework1
# Read the chocolate ratings data set from the project's data/ directory.
chocolate <- readRDS(file = here::here("data", "chocolate.RDS"))

Introduction

This is my solution for Project 1

library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Compact column-by-column overview of the raw data.
dplyr::glimpse(x = chocolate)
## Rows: 2,530
## Columns: 10
## $ ref                              <dbl> 2454, 2458, 2454, 2542, 2546, 2546, 2…
## $ company_manufacturer             <chr> "5150", "5150", "5150", "5150", "5150…
## $ company_location                 <chr> "U.S.A.", "U.S.A.", "U.S.A.", "U.S.A.…
## $ review_date                      <dbl> 2019, 2019, 2019, 2021, 2021, 2021, 2…
## $ country_of_bean_origin           <chr> "Tanzania", "Dominican Republic", "Ma…
## $ specific_bean_origin_or_bar_name <chr> "Kokoa Kamili, batch 1", "Zorzal, bat…
## $ cocoa_percent                    <chr> "76%", "76%", "76%", "68%", "72%", "8…
## $ ingredients                      <chr> "3- B,S,C", "3- B,S,C", "3- B,S,C", "…
## $ most_memorable_characteristics   <chr> "rich cocoa, fatty, bready", "cocoa, …
## $ rating                           <dbl> 3.25, 3.50, 3.75, 3.00, 3.00, 3.25, 3…

Solution 1

library(dplyr)
library(ggplot2)
# Histogram of ratings with R's default binning.
hist(chocolate$rating)

# Same histogram with 10, 15, 20 and 25 equal-width bins spanning the observed
# rating range (k bins need k + 1 break points).
for (n_bins in c(10, 15, 20, 25)) {
  hist(
    chocolate$rating,
    breaks = seq(
      min(chocolate$rating),
      max(chocolate$rating),
      length.out = n_bins + 1
    )
  )
}

# First five rows of the data for reference.
head(chocolate, n = 5)
## # A tibble: 5 × 10
##     ref company_manufacturer company_location review_date country_of_bean_origin
##   <dbl> <chr>                <chr>                  <dbl> <chr>                 
## 1  2454 5150                 U.S.A.                  2019 Tanzania              
## 2  2458 5150                 U.S.A.                  2019 Dominican Republic    
## 3  2454 5150                 U.S.A.                  2019 Madagascar            
## 4  2542 5150                 U.S.A.                  2021 Fiji                  
## 5  2546 5150                 U.S.A.                  2021 Venezuela             
## # ℹ 5 more variables: specific_bean_origin_or_bar_name <chr>,
## #   cocoa_percent <chr>, ingredients <chr>,
## #   most_memorable_characteristics <chr>, rating <dbl>

#I evaluated the histograms with different numbers of bins and found that the histogram with 15 bins provides the best balance between detail and clarity. With too few bins (10), the distribution appears too generalized, obscuring finer details. Conversely, too many bins (25) create a fragmented view that adds unnecessary noise. The 15-bin histogram offers a clear visualization of the overall distribution, capturing key features of the data without overwhelming the viewer.

Solution 2

# Number of reviews per country of bean origin.
table(chocolate[["country_of_bean_origin"]])
## 
##             Australia                Belize                 Blend 
##                     3                    76                   156 
##               Bolivia                Brazil                 Burma 
##                    80                    78                     1 
##              Cameroon                 China              Colombia 
##                     3                     1                    79 
##                 Congo            Costa Rica                  Cuba 
##                    11                    43                    12 
##    Dominican Republic              DR Congo               Ecuador 
##                   226                     1                   219 
##           El Salvador                  Fiji                 Gabon 
##                     6                    16                     1 
##                 Ghana               Grenada             Guatemala 
##                    41                    19                    62 
##                 Haiti              Honduras                 India 
##                    30                    25                    35 
##             Indonesia           Ivory Coast               Jamaica 
##                    20                     7                    24 
##               Liberia            Madagascar              Malaysia 
##                     3                   177                     8 
##            Martinique                Mexico             Nicaragua 
##                     1                    55                   100 
##               Nigeria                Panama      Papua New Guinea 
##                     3                     9                    50 
##                  Peru           Philippines              Principe 
##                   244                    24                     1 
##           Puerto Rico                 Samoa              Sao Tome 
##                     7                     3                    14 
##   Sao Tome & Principe          Sierra Leone       Solomon Islands 
##                     2                     4                    10 
##             Sri Lanka             St. Lucia St.Vincent-Grenadines 
##                     2                    10                     1 
##              Sulawesi               Sumatra              Suriname 
##                     1                     1                     1 
##                Taiwan              Tanzania              Thailand 
##                     2                    79                     5 
##                Tobago                  Togo              Trinidad 
##                     2                     3                    42 
##                U.S.A.                Uganda               Vanuatu 
##                    33                    19                    13 
##             Venezuela               Vietnam 
##                   253                    73

Solution 3

# Keep only the reviews of bars made from Ecuadorian beans.
country_Ecuador <- chocolate %>%
  filter(country_of_bean_origin == "Ecuador")

# Echo the spread, count and centre of the Ecuador ratings one by one.
sd(country_Ecuador[["rating"]])
## [1] 0.5122678
length(country_Ecuador[["rating"]])
## [1] 219
mean(country_Ecuador[["rating"]])
## [1] 3.164384

# Store the same three statistics and collect them in a one-row data frame.
total_reviews <- length(country_Ecuador[["rating"]])
sd_rating <- sd(country_Ecuador[["rating"]])
mean_rating <- mean(country_Ecuador[["rating"]])

ecuador_summary <- data.frame(mean = mean_rating, sd = sd_rating, total = total_reviews)

print(ecuador_summary)
##       mean        sd total
## 1 3.164384 0.5122678   219

Solution 4

# Rebuild the Ecuador subset so this solution does not depend on Solution 3.
country_Ecuador <- chocolate %>%
  filter(country_of_bean_origin == "Ecuador")

# Mean rating per manufacturer location, then the row with the highest mean.
location_avg_rating <- aggregate(
  rating ~ company_location,
  data = country_Ecuador,
  FUN = mean
)
best_location <- location_avg_rating[which.max(location_avg_rating[["rating"]]), ]
print(best_location)
##   company_location rating
## 2        Australia 3.8125

Australia makes the best chocolate (or has the highest ratings on average) with beans from Ecuador.

Solution 5

# Mean rating for every country of bean origin (no minimum review count).
avg_rating_by_country <- aggregate(
  rating ~ country_of_bean_origin,
  data = chocolate,
  FUN = mean
)

# Sort from highest to lowest mean rating and keep the first three rows.
sorted_avg_rating <- avg_rating_by_country[order(-avg_rating_by_country[["rating"]]), ]
top_3_countries <- head(sorted_avg_rating, n = 3)
print(top_3_countries)
##    country_of_bean_origin rating
## 55                 Tobago  3.625
## 8                   China  3.500
## 43    Sao Tome & Principe  3.500

The top 3 countries (by bean origin) with the highest average ratings are Tobago, China, and Sao Tome & Principe.

Solution 6

# Reviews per country of bean origin.
review_counts <- count(chocolate, country_of_bean_origin)

# Names of the countries that have at least 10 reviews.
countries_with_min_reviews <-
  review_counts$country_of_bean_origin[review_counts$n >= 10]

# Restrict the data to those countries.
filtered_chocolate <- filter(
  chocolate,
  country_of_bean_origin %in% countries_with_min_reviews
)

# Mean rating per remaining country.
avg_rating_by_country <- summarise(
  group_by(filtered_chocolate, country_of_bean_origin),
  mean_rating = mean(rating, na.rm = TRUE)
)

# Three countries with the highest mean rating.
top_3_countries <- avg_rating_by_country %>%
  arrange(desc(mean_rating)) %>%
  head(n = 3)
print(top_3_countries)
## # A tibble: 3 × 2
##   country_of_bean_origin mean_rating
##   <chr>                        <dbl>
## 1 Solomon Islands               3.45
## 2 Congo                         3.32
## 3 Cuba                          3.29

The top 3 countries with the highest average chocolate ratings are the Solomon Islands (3.45), Congo (3.32), and Cuba (3.29).

SOLUTION 7

library(dplyr)
# Reviews per country of bean origin.
country_counts <- count(chocolate, country_of_bean_origin, name = "total_reviews")

# Countries with at least 50 reviews.
countries_50_reviews <- filter(country_counts, total_reviews >= 50)

# Keep only the reviews whose bean origin is one of those countries.
filtered_chocolate <- filter(
  chocolate,
  country_of_bean_origin %in% pull(countries_50_reviews, country_of_bean_origin)
)
table(filtered_chocolate[["country_of_bean_origin"]])
## 
##             Belize              Blend            Bolivia             Brazil 
##                 76                156                 80                 78 
##           Colombia Dominican Republic            Ecuador          Guatemala 
##                 79                226                219                 62 
##         Madagascar             Mexico          Nicaragua   Papua New Guinea 
##                177                 55                100                 50 
##               Peru           Tanzania          Venezuela            Vietnam 
##                244                 79                253                 73
# table(chocolate$country_of_bean_origin)

library(forcats)

# Bin every bar by cocoa content into four ordered groups.
#
# cocoa_percent is stored as text such as "76%". Comparing that text with a
# number is a character comparison (e.g. "100%" sorts below "60"), so the
# percent sign is stripped and the value converted to a number before binning.
# cut() with right = FALSE yields the half-open intervals [lower, upper) and
# returns a factor whose levels are already in ascending order.
# NOTE: echoed output further down was produced before this fix; re-knit to
# refresh it.
filtered_chocolate <- filtered_chocolate %>%
  mutate(
    percent_group = cut(
      as.numeric(sub("%", "", cocoa_percent, fixed = TRUE)),
      breaks = c(-Inf, 60, 70, 90, Inf),
      labels = c("<60%", ">=60 to <70%", ">=70 to <90%", ">=90%"),
      right = FALSE
    )
  )
table(filtered_chocolate$percent_group)
## 
##         <60% >=60 to <70% >=70 to <90%        >=90% 
##           41          270         1682           14
# Peek at the first 20 rows, including the new percent_group column.
head(filtered_chocolate, n = 20)
## # A tibble: 20 × 11
##      ref company_manufacturer company_location review_date
##    <dbl> <chr>                <chr>                  <dbl>
##  1  2454 5150                 U.S.A.                  2019
##  2  2458 5150                 U.S.A.                  2019
##  3  2454 5150                 U.S.A.                  2019
##  4  2546 5150                 U.S.A.                  2021
##  5   797 A. Morin             France                  2012
##  6   797 A. Morin             France                  2012
##  7  1015 A. Morin             France                  2013
##  8  1011 A. Morin             France                  2013
##  9  1011 A. Morin             France                  2013
## 10  1015 A. Morin             France                  2013
## 11  1019 A. Morin             France                  2013
## 12  1019 A. Morin             France                  2013
## 13  1011 A. Morin             France                  2013
## 14  1015 A. Morin             France                  2013
## 15  1019 A. Morin             France                  2013
## 16  1315 A. Morin             France                  2014
## 17  1315 A. Morin             France                  2014
## 18  1319 A. Morin             France                  2014
## 19  1319 A. Morin             France                  2014
## 20  1704 A. Morin             France                  2015
## # ℹ 7 more variables: country_of_bean_origin <chr>,
## #   specific_bean_origin_or_bar_name <chr>, cocoa_percent <chr>,
## #   ingredients <chr>, most_memorable_characteristics <chr>, rating <dbl>,
## #   percent_group <fct>
# Make sure percent_group is a factor whose levels run from the lowest to the
# highest cocoa band, then show the resulting level order.
filtered_chocolate <- mutate(
  filtered_chocolate,
  percent_group = fct_relevel(
    as.factor(percent_group),
    "<60%", ">=60 to <70%", ">=70 to <90%", ">=90%"
  )
)
levels(filtered_chocolate[["percent_group"]])
## [1] "<60%"         ">=60 to <70%" ">=70 to <90%" ">=90%"
library(ggplot2)

# Rating distribution per cocoa-percentage group, one panel per country of
# bean origin.
ggplot(
  data = filtered_chocolate,
  mapping = aes(x = percent_group, y = rating, fill = percent_group)
) +
  geom_boxplot() +
  facet_wrap(vars(country_of_bean_origin)) +
  labs(
    x = "Chocolate Percentage Group",
    y = "Rating",
    title = "Boxplots of Ratings by Chocolate Percentage Groups"
  ) +
  # Tilt the group labels so they do not overlap inside the narrow panels.
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Mean rating per cocoa-percentage group across all retained countries.
average_ratings <- summarise(
  group_by(filtered_chocolate, percent_group),
  avg_rating = mean(rating, na.rm = TRUE)
)

print(average_ratings)
## # A tibble: 4 × 2
##   percent_group avg_rating
##   <fct>              <dbl>
## 1 <60%                2.66
## 2 >=60 to <70%        3.25
## 3 >=70 to <90%        3.22
## 4 >=90%               2.75

## The >=60 to <70% group has the highest average rating (3.246). ## There is overall agreement between countries.

Part 2 : Join Two Datasets together

##Solution

# Point install.packages() at a CRAN mirror so a non-interactive knit can
# install a missing package without prompting.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install gapminder only when it is not already available, then attach it
# once. requireNamespace() checks availability without attaching the package.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
library(gapminder)

# Lookup table with one row per gapminder country and its continent.
# country is converted to character so it matches the character join key.
gapminder_continents <- gapminder %>%
  distinct(country, continent) %>%
  mutate(country = as.character(country))

# gapminder does not list Belize or Papua New Guinea, so a plain left join
# leaves their continent as NA. Supply the two continents by hand, reusing
# gapminder's continent levels so the column stays a factor. anti_join() skips
# any country the lookup already contains, which avoids duplicate matches.
manual_continents <- tibble(
  country = c("Belize", "Papua New Guinea"),
  continent = factor(
    c("Americas", "Oceania"),
    levels = levels(gapminder$continent)
  )
) %>%
  anti_join(gapminder_continents, by = "country")
gapminder_continents <- bind_rows(gapminder_continents, manual_continents)

# Attach the continent of the bean-origin country to every review.
chocolate_with_continent <- filtered_chocolate %>%
  left_join(gapminder_continents, by = c("country_of_bean_origin" = "country"))
head(chocolate_with_continent)
## # A tibble: 6 × 12
##     ref company_manufacturer company_location review_date country_of_bean_origin
##   <dbl> <chr>                <chr>                  <dbl> <chr>                 
## 1  2454 5150                 U.S.A.                  2019 Tanzania              
## 2  2458 5150                 U.S.A.                  2019 Dominican Republic    
## 3  2454 5150                 U.S.A.                  2019 Madagascar            
## 4  2546 5150                 U.S.A.                  2021 Venezuela             
## 5   797 A. Morin             France                  2012 Bolivia               
## 6   797 A. Morin             France                  2012 Peru                  
## # ℹ 7 more variables: specific_bean_origin_or_bar_name <chr>,
## #   cocoa_percent <chr>, ingredients <chr>,
## #   most_memorable_characteristics <chr>, rating <dbl>, percent_group <fct>,
## #   continent <fct>
library(dplyr)
# Reviews per country of bean origin in the joined data.
country_counts <- count(
  chocolate_with_continent,
  country_of_bean_origin,
  name = "total_reviews"
)

# Keep countries with at least 10 reviews and drop the "Blend" pseudo-country.
# NOTE(review): chocolate_with_continent is built from filtered_chocolate,
# which was already limited to countries with >= 50 reviews, so the >= 10
# condition looks like a no-op here - confirm whether the join was meant to
# start from the full chocolate data.
chocolate_filtered <- filter(
  chocolate_with_continent,
  country_of_bean_origin %in%
    country_counts$country_of_bean_origin[country_counts$total_reviews >= 10],
  country_of_bean_origin != "Blend"
)
table(chocolate_with_continent[["country_of_bean_origin"]])
## 
##             Belize              Blend            Bolivia             Brazil 
##                 76                156                 80                 78 
##           Colombia Dominican Republic            Ecuador          Guatemala 
##                 79                226                219                 62 
##         Madagascar             Mexico          Nicaragua   Papua New Guinea 
##                177                 55                100                 50 
##               Peru           Tanzania          Venezuela            Vietnam 
##                244                 79                253                 73
# How many retained reviews have no continent after the join?
sum(is.na(chocolate_filtered[["continent"]]))
## [1] 126
# Reviews per country after the filtering step.
table(chocolate_filtered[["country_of_bean_origin"]])
## 
##             Belize            Bolivia             Brazil           Colombia 
##                 76                 80                 78                 79 
## Dominican Republic            Ecuador          Guatemala         Madagascar 
##                226                219                 62                177 
##             Mexico          Nicaragua   Papua New Guinea               Peru 
##                 55                100                 50                244 
##           Tanzania          Venezuela            Vietnam 
##                 79                253                 73
library(ggplot2)

# Distribution of ratings per continent of bean origin, drawn as violins.
ggplot(
  data = chocolate_filtered,
  mapping = aes(x = continent, y = rating, fill = continent)
) +
  geom_violin() +
  labs(
    x = "Continent",
    y = "Rating",
    title = "Violin Plot of Ratings by Continent"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

Part 3 : Convert wide data into long data

# # install.packages(c("dplyr", "stringr", "tidyr"))
# 
library(dplyr)
library(stringr)
# Report whether dplyr can be loaded; require() returns TRUE on success.
if (require("dplyr", quietly = TRUE)) {
  print("dplyr is installed")
} else {
  print("dplyr is not installed")
}
## [1] "dplyr is installed"
library(dplyr)
library(stringr)

# Add 0/1 indicator columns to chocolate: six ingredient flags derived from the
# ingredients code string and six flavour flags derived from
# most_memorable_characteristics. Rows where the source text is NA get NA.
# The flags are computed once; the original computed the ingredient flags
# twice with identical code.
# NOTE: echoed output further down was produced before this fix; re-knit to
# refresh it.
chocolate <- chocolate %>%
  mutate(
    beans = as.numeric(str_detect(ingredients, "B")),
    # A bare "S" would also match the salt code "Sa" and an "S*" code
    # (presumably a non-sugar sweetener - confirm against the data dictionary),
    # so only count an S that is not followed by "a" or "*".
    sugar = as.numeric(str_detect(ingredients, "S(?![a*])")),
    cocoa_butter = as.numeric(str_detect(ingredients, "C")),
    vanilla = as.numeric(str_detect(ingredients, "V")),
    lecithin = as.numeric(str_detect(ingredients, "L")),
    salt = as.numeric(str_detect(ingredients, "Sa")),
    char_cocoa = as.numeric(str_detect(most_memorable_characteristics, "cocoa")),
    char_sweet = as.numeric(str_detect(most_memorable_characteristics, "sweet")),
    char_nutty = as.numeric(str_detect(most_memorable_characteristics, "nutty")),
    char_creamy = as.numeric(str_detect(most_memorable_characteristics, "creamy")),
    char_roasty = as.numeric(str_detect(most_memorable_characteristics, "roasty")),
    char_earthy = as.numeric(str_detect(most_memorable_characteristics, "earthy"))
  )
# Per review year, the share of reviews carrying each flavour/ingredient flag
# (the mean of a 0/1 column). Output columns are named mean_<flag>.
chocolate_summary <- chocolate %>%
  group_by(review_date) %>%
  summarise(
    across(
      c(
        char_cocoa, char_sweet, char_nutty, char_creamy, char_roasty,
        char_earthy, beans, sugar, cocoa_butter, vanilla, lecithin, salt
      ),
      ~ mean(.x, na.rm = TRUE),
      .names = "mean_{.col}"
    )
  )
print(chocolate_summary)
## # A tibble: 16 × 13
##    review_date mean_char_cocoa mean_char_sweet mean_char_nutty mean_char_creamy
##          <dbl>           <dbl>           <dbl>           <dbl>            <dbl>
##  1        2006          0.210           0.161           0.0323           0.242 
##  2        2007          0.342           0.0959          0.0411           0.233 
##  3        2008          0.109           0.130           0.152            0.0978
##  4        2009          0.146           0.154           0.154            0.0894
##  5        2010          0.218           0.1             0.145            0.0909
##  6        2011          0.172           0.110           0.117            0.129 
##  7        2012          0.0876          0.139           0.103            0.0722
##  8        2013          0.175           0.126           0.115            0.0710
##  9        2014          0.0607          0.0972          0.158            0.0486
## 10        2015          0.127           0.106           0.109            0.0423
## 11        2016          0.0922          0.171           0.157            0.0553
## 12        2017          0.133           0.0952          0.0667           0.0952
## 13        2018          0.180           0.118           0.0789           0.0439
## 14        2019          0.259           0.145           0.0725           0.0881
## 15        2020          0.284           0.160           0.0494           0.0370
## 16        2021          0.297           0.126           0.0971           0.0171
## # ℹ 8 more variables: mean_char_roasty <dbl>, mean_char_earthy <dbl>,
## #   mean_beans <dbl>, mean_sugar <dbl>, mean_cocoa_butter <dbl>,
## #   mean_vanilla <dbl>, mean_lecithin <dbl>, mean_salt <dbl>
# Reshape to long form: one row per (review year, feature) pair, with the
# former column name in feature and its value in mean_score.
chocolate_long <- pivot_longer(
  chocolate_summary,
  cols = starts_with("mean_"),
  names_to = "feature",
  values_to = "mean_score"
)
print(chocolate_long)
## # A tibble: 192 × 3
##    review_date feature           mean_score
##          <dbl> <chr>                  <dbl>
##  1        2006 mean_char_cocoa       0.210 
##  2        2006 mean_char_sweet       0.161 
##  3        2006 mean_char_nutty       0.0323
##  4        2006 mean_char_creamy      0.242 
##  5        2006 mean_char_roasty      0.0484
##  6        2006 mean_char_earthy      0.0645
##  7        2006 mean_beans            1     
##  8        2006 mean_sugar            1     
##  9        2006 mean_cocoa_butter     0.933 
## 10        2006 mean_vanilla          0.717 
## # ℹ 182 more rows

PART 4 : DATA VISUALIZATION

library(ggplot2)

# Yearly mean score of every feature: points plus a loess trend line, one
# panel per feature, each panel with its own y scale.
ggplot(
  data = chocolate_long,
  mapping = aes(x = review_date, y = mean_score, colour = feature)
) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(vars(feature), scales = "free_y") +
  labs(
    x = "Year of Review",
    y = "Mean Score",
    title = "Mean Chocolate Feature Scores Over Time",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    strip.text = element_text(size = 12, face = "bold"),
    plot.caption = element_text(hjust = 1, size = 10),
    plot.subtitle = element_text(margin = margin(b = 10), size = 12),
    plot.title = element_text(size = 16, face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

PART 5 : Make the worst plot you can!

# "Worst plot" variant: the faceted trend plot of yearly mean feature scores
# with one deliberate flaw - the points are drawn at size = 3.
ggplot(chocolate_long, aes(x = review_date, y = mean_score, color = feature)) +
  # Intentionally oversized points.
  geom_point(size = 3) +
  geom_smooth(se = FALSE, method = "loess") +  
  # One panel per feature, each with its own y scale.
  facet_wrap(~ feature, scales = "free_y") +  
  labs(
    title = "Mean Chocolate Feature Scores Over Time",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    x = "Year of Review",  
    y = "Mean Score", 
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, margin = margin(b = 10)),
    plot.caption = element_text(size = 10, hjust = 1),
    strip.text = element_text(face = "bold", size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

1. Large, Messy Points: Using a large point size (e.g., size = 3) clutters the plot. It underscores the need to choose a point size that balances emphasis and clarity in visualizations.

# "Worst plot" variant: the faceted trend plot with one deliberate flaw - a
# dotted line connecting the yearly points is layered over the loess curve.
ggplot(chocolate_long, aes(x = review_date, y = mean_score, color = feature)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "loess") + 
# Intentionally distracting extra layer: dotted line through the points.
geom_line(linetype = "dotted") +
  # One panel per feature, each with its own y scale.
  facet_wrap(~ feature, scales = "free_y") +  
  labs(
    title = "Mean Chocolate Feature Scores Over Time",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    x = "Year of Review",  
    y = "Mean Score", 
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, margin = margin(b = 10)),
    plot.caption = element_text(size = 10, hjust = 1),
    strip.text = element_text(face = "bold", size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Distracting Dotted Lines: The line type = “dotted” can make it difficult to follow the data trends, especially when combined with messy points. It’ll help in choosing line styles that clearly convey information and ensure data visibility.
# "Worst plot" variant: the faceted trend plot with one deliberate flaw - the
# title is set in italics at size 8.
ggplot(chocolate_long, aes(x = review_date, y = mean_score, color = feature)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "loess") +  
  # One panel per feature, each with its own y scale.
  facet_wrap(~ feature, scales = "free_y") +  
  labs(
    title = "Mean Chocolate Feature Scores Over Time",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    x = "Year of Review",  
    y = "Mean Score", 
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    # Intentionally small italic title (smaller than the size-12 subtitle).
    plot.title = element_text(face = "italic", size = 8),
    plot.subtitle = element_text(size = 12, margin = margin(b = 10)),
    plot.caption = element_text(size = 10, hjust = 1),
    strip.text = element_text(face = "bold", size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Distracting Title Style: An italicized plot.title set at a very small size (size = 8) is hard to read and stylistically inconsistent with the rest of the plot.
  • It emphasizes the need for a clear, consistent font style to improve comprehension and maintain focus on the data.
# "Worst plot" variant: the faceted trend plot with one deliberate flaw - the
# subtitle is coloured "chartreuse".
ggplot(chocolate_long, aes(x = review_date, y = mean_score, color = feature)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "loess") +  
  # One panel per feature, each with its own y scale.
  facet_wrap(~ feature, scales = "free_y") +  
  labs(
    title = "Mean Chocolate Feature Scores Over Time",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    x = "Year of Review",  
    y = "Mean Score", 
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    # Intentionally garish subtitle colour.
    plot.subtitle = element_text(size = 12, color = "chartreuse"),
    plot.caption = element_text(size = 10, hjust = 1),
    strip.text = element_text(face = "bold", size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Using “chartreuse” - for the subtitle color makes the text visually jarring and difficult to focus on due to its intense brightness and clashing with typical plot backgrounds.
  • Highlights the importance of choosing balanced, harmonious colors to ensure text is legible and enhances the overall visual appeal of a plot.
# "Worst plot" variant: the faceted trend plot with one deliberate flaw - the
# caption is tiny (size 5), centred, dark orange and italic.
ggplot(chocolate_long, aes(x = review_date, y = mean_score, color = feature)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "loess") +  
  # One panel per feature, each with its own y scale.
  facet_wrap(~ feature, scales = "free_y") +  
  labs(
    title = "Mean Chocolate Feature Scores Over Time",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    x = "Year of Review",  
    y = "Mean Score", 
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, margin = margin(b = 10)),
   # Intentionally hard-to-read caption styling.
   plot.caption = element_text(size = 5, hjust = 0.5, color = "darkorange", face = "italic"),
    strip.text = element_text(face = "bold", size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Small, poorly readable text with a garish color and italic style : This makes the caption text small, which is hard to read, uses a bright and harsh color that clashes with most backgrounds, and applies an italic style that adds to the readability issues.
  • Recognizing how small, bright, and italicized text can hinder readability helps emphasize the need for clear, well-contrasted, and appropriately sized text.
# "Worst plot" variant: the faceted trend plot with one deliberate flaw - the
# title string is cluttered with markup characters and emoji.
ggplot(chocolate_long, aes(x = review_date, y = mean_score, color = feature)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "loess") +  
  # One panel per feature, each with its own y scale.
  facet_wrap(~ feature, scales = "free_y") +  
  labs(
    # Intentionally over-decorated title; the tildes, asterisks and emoji are
    # part of the title text itself.
    title = "~~Mean Chocolate Feature Scores~~ *Over* **Time** 🌟🎉",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    x = "Year of Review",  
    y = "Mean Score", 
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, margin = margin(b = 10)),
    plot.caption = element_text(size = 10, hjust = 1),
    strip.text = element_text(face = "bold", size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Overly stylized with excessive symbols and formatting : This version uses excessive styling with strikethroughs, italics, bold text, and emojis, which can be distracting and take away from the clarity of the title. -This will help in creating effective data visualizations by emphasizing the importance of simplicity and focus in text elements.
# "Worst plot" variant: the faceted trend plot with one deliberate flaw - an
# overly long y-axis label.
ggplot(chocolate_long, aes(x = review_date, y = mean_score, color = feature)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "loess") +  
  # One panel per feature, each with its own y scale.
  facet_wrap(~ feature, scales = "free_y") +  
  labs(
    title = "Mean Chocolate Feature Scores Over Time",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    x = "Year of Review",  
   # Intentionally verbose axis label.
   y = "Score (mean for every single chocolate ever reviewed)"
, 
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 12, margin = margin(b = 10)),
    plot.caption = element_text(size = 10, hjust = 1),
    strip.text = element_text(face = "bold", size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Overly verbose and lengthy : This label is excessively detailed and unnecessarily long, making it cumbersome to read.
  • An overly verbose label highlights the importance of clear and concise labeling to ensure effective communication in data visualization.

###Part 6: Make my plot a better plot!

# Dodged bar chart: for each review year, one bar per feature whose height is
# that feature's mean score in that year.
ggplot(
  data = chocolate_long,
  mapping = aes(x = review_date, y = mean_score, fill = feature)
) +
  # geom_col() draws bars whose heights are the y values themselves.
  geom_col(position = "dodge") +
  labs(
    x = "Year of Review",
    y = "Mean Score",
    title = "Mean Chocolate Feature Scores by Year",
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    # Text elements.
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(margin = margin(b = 10), size = 12),
    plot.caption = element_text(hjust = 1, size = 10),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    axis.text.x = element_text(hjust = 1, angle = 45),
    # Legend and panel settings.
    legend.text = element_text(size = 10),
    legend.position = "right",
    panel.spacing = unit(1, "lines"),
    panel.background = element_rect()
  )

    1. Bar Chart Conversion: I transformed the previous plot into a dodged bar chart so that the features can be compared side by side within each review year. The bar heights show the yearly mean scores (not frequencies or sums), making patterns and differences between features more apparent.
# Faceted trend plot with a white panel background and the x-axis tick labels
# rotated by 45 degrees.
ggplot(
  data = chocolate_long,
  mapping = aes(x = review_date, y = mean_score, colour = feature)
) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(vars(feature), scales = "free_y") +
  labs(
    x = "Year of Review",
    y = "Mean Score",
    title = "Mean Chocolate Feature Scores Over Time",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    caption = "Data visualization by: AYUSHI GUPTA"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(hjust = 1, angle = 45),
    panel.background = element_rect(fill = "white"),
    strip.text = element_text(size = 12, face = "bold"),
    plot.caption = element_text(hjust = 1, size = 10),
    plot.subtitle = element_text(margin = margin(b = 10), size = 12),
    plot.title = element_text(size = 16, face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Angled x-axis labels: I rotated the x-axis labels by 45 degrees to prevent overlap and enhance readability. This adjustment ensures that longer labels are fully visible and easier to read, improving the overall clarity of the plot.
# Faceted scatter plot of mean_score against review_date with a black loess
# trend line, shaded facet strips and extra space between the panels.
ggplot(
  data = chocolate_long,
  mapping = aes(review_date, mean_score, color = feature)
) +
  facet_wrap(~ feature, scales = "free_y") +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "loess", se = FALSE, color = "black") +
  theme_minimal(base_size = 14) +
  theme(
    # Panel layout: two lines of space between facets, white panel fill.
    panel.spacing = unit(2, "lines"),
    panel.background = element_rect(fill = "white"),
    # Facet strips: bold labels on a light grey background.
    strip.background = element_rect(fill = "lightgrey"),
    strip.text = element_text(face = "bold", size = 10),
    # Rotate the x-axis tick labels by 45 degrees.
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(margin = margin(b = 10), size = 12),
    plot.caption = element_text(hjust = 1, size = 10)
  ) +
  labs(
    caption = "Data visualization by: AYUSHI GUPTA",
    y = "Mean Score",
    x = "Year of Review",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    title = "Mean Chocolate Feature Scores Over Time"
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Increased spacing between facets for better clarity: I increased the spacing between facets to avoid overcrowding and improve readability, making it easier to distinguish between different facets.
# Faceted scatter plot of mean_score against review_date, drawn with larger,
# semi-transparent points and a loess trend line in each feature's panel.
ggplot(
  data = chocolate_long,
  mapping = aes(review_date, mean_score, color = feature)
) +
  facet_wrap(~ feature, scales = "free_y") +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "loess", se = FALSE) +  # trend only, no confidence band
  theme_minimal(base_size = 14) +
  theme(
    # Rotate the x-axis tick labels by 45 degrees.
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.background = element_rect(fill = "white"),
    strip.text = element_text(face = "bold", size = 10),
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(margin = margin(b = 10), size = 12),
    plot.caption = element_text(hjust = 1, size = 10)
  ) +
  labs(
    caption = "Data visualization by: AYUSHI GUPTA",
    y = "Mean Score",
    x = "Year of Review",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    title = "Mean Chocolate Feature Scores Over Time"
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Increased point size and added transparency (size = 2, alpha = 0.7) to make the data points more visible.
# Faceted scatter plot of mean_score against review_date; the loess trend
# line is drawn in black instead of inheriting the per-feature color.
ggplot(
  data = chocolate_long,
  mapping = aes(review_date, mean_score, color = feature)
) +
  facet_wrap(~ feature, scales = "free_y") +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "loess", se = FALSE, color = "black") +
  theme_minimal(base_size = 14) +
  theme(
    # Rotate the x-axis tick labels by 45 degrees.
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.background = element_rect(fill = "white"),
    strip.text = element_text(face = "bold", size = 10),
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(margin = margin(b = 10), size = 12),
    plot.caption = element_text(hjust = 1, size = 10)
  ) +
  labs(
    caption = "Data visualization by: AYUSHI GUPTA",
    y = "Mean Score",
    x = "Year of Review",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    title = "Mean Chocolate Feature Scores Over Time"
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Changed the smooth line color to black for better contrast with the colored points.
# Faceted scatter plot of mean_score against review_date with default points,
# a loess trend line per feature, and two lines of space between panels.
ggplot(
  data = chocolate_long,
  mapping = aes(review_date, mean_score, color = feature)
) +
  facet_wrap(~ feature, scales = "free_y") +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +  # trend only, no confidence band
  theme_minimal(base_size = 14) +
  theme(
    panel.spacing = unit(2, "lines"),
    strip.text = element_text(size = 10, face = "bold"),
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(margin = margin(b = 10), size = 12),
    plot.caption = element_text(hjust = 1, size = 10)
  ) +
  labs(
    caption = "Data visualization by: AYUSHI GUPTA",
    y = "Mean Score",
    x = "Year of Review",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    title = "Mean Chocolate Feature Scores Over Time"
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Increased spacing between facets to reduce visual clutter and make each individual plot more distinct. This adjustment provides clearer separation between the plots, allowing for easier comparison and reducing the likelihood of overlapping content.
# Faceted scatter plot of mean_score against review_date using triangular
# point markers (shape 17) and a black loess trend line in each panel.
ggplot(
  data = chocolate_long,
  mapping = aes(review_date, mean_score, color = feature)
) +
  facet_wrap(~ feature, scales = "free_y") +
  geom_point(shape = 17) +
  geom_smooth(method = "loess", se = FALSE, color = "black") +
  theme_minimal(base_size = 14) +
  theme(
    # Rotate the x-axis tick labels by 45 degrees.
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.background = element_rect(fill = "white"),
    strip.text = element_text(face = "bold", size = 10),
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(margin = margin(b = 10), size = 12),
    plot.caption = element_text(hjust = 1, size = 10)
  ) +
  labs(
    caption = "Data visualization by: AYUSHI GUPTA",
    y = "Mean Score",
    x = "Year of Review",
    subtitle = "Visualizing the trends in characteristics and ingredients across review years",
    title = "Mean Chocolate Feature Scores Over Time"
  )
## `geom_smooth()` using formula = 'y ~ x'

    1. Changed point shape to triangle - I modified the point shape to triangles to enhance visual distinction and add a unique element to the plot. This change helps differentiate the data points more clearly from other graphical elements, improving overall readability and visual appeal.